qui {

noisily {
	/*************************************************/
	/********** Step 1.6. Human capital **************/
	/*************************************************/

/* 1.6.1. Educational attainment */
/*
To gain panel data on the evolution of educational attainment, our strategy is the following:
	- Collect educational attainment from various sources (secondary enrollment rates & average years of education)
	- Linearly interpolate missing values in all data sources
	- Select the CLIO data on average years of education as baseline series
	- Extend the baseline using the growth rates in the UNDP data on average years of education
	- Use additional data (Barro & Lee average years of education; WDI and Barro & Lee secondary enrollment rates) to extend the baseline data:
		- approximate using third order polynomial (region, year, alternative data series)
		- extend using growth rates implied in approximations
	- Result = composite index of educ, expressed in average years of education
*/
}

/* 1.6.1.1. Load data */
* Move from the current directory to the sibling folder that holds the datasets
* merged below. Forward slashes are used because Stata accepts them on every
* platform, whereas a backslash path only resolves on Windows.
cd "../3. Intermediary results"

	* Every source is merged on its own country code plus year.
	*	- keepusing()        loads only the listed variables from the using file
	*	- keep(master match) discards rows that exist only in the using file
	*	- nogenerate         suppresses the _merge indicator
	* destring, force converts any non-numeric entry to missing.

	* Barro & Lee (1994)
	merge 1:1 cntrycode_barlee year using Barlee, keepusing(ays_barlee senroll_barlee) keep(master match) nogenerate
	destring ays_barlee senroll_barlee, force replace

	* Barro & Lee (2017)
	merge 1:1 cntrycode_barlee2017 year using Barlee2017, keepusing(ays_barlee2017) keep(master match) nogenerate
	destring ays_barlee2017, force replace

	* CLIO (2017)
	merge 1:1 cntrycode_CLIO year using CLIO_ays, keepusing(ays_CLIO) keep(master match) nogenerate
	destring ays_CLIO, force replace

	* UNDP (2017)
	merge 1:1 cntrycode_UNDP year using UNDP, keepusing(ays_UNDP) keep(master match) nogenerate
	destring ays_UNDP, force replace

	* WDI (2017)
	merge 1:1 cntrycode_WDI year using WDI, keepusing(senroll_WDI) keep(master match) nogenerate
	destring senroll_WDI, force replace

	* Give the series numbered names so later steps can loop over educ_1 ... educ_6.
	* 1-4 are average-years-of-schooling series, 5-6 are secondary enrollment series.
	rename ays_CLIO educ_1
	rename ays_UNDP educ_2
	rename ays_barlee2017 educ_3
	rename ays_barlee educ_4
	rename senroll_WDI educ_5
	rename senroll_barlee educ_6
	
/* 1.6.1.2. Label data */

* Series 1-4: average years of education, labelled with the originating source
local pos = 0
foreach origin in CLIO UNDP barlee2017 barlee {
	local ++pos
	label variable educ_`pos' "Average years of education, source: `origin'"
}

* Series 5-6: secondary enrollment
label variable educ_5 "Enrollment secondary education, source: WDI"
label variable educ_6 "Enrollment secondary education, source: Barlee"

/* 1.6.1.3. Linearly interpolate missing values in all datasets */
sort cntry year

* For every source series educ_1 ... educ_6:
*	- educ_k_raw          keeps an untouched copy of the series
*	- interpolated_educ_k is 0 for values present before interpolation and
*	                      1 for values filled in by ipolate
forvalues k = 1/6 {
	gen interpolated_educ_`k' = 0 if educ_`k' != .
	gen educ_`k'_raw = educ_`k' 	// Keep raw data, to track number of linearly interpolated contributions

	* Linear interpolation over year, separately within each country
	by cntry: ipolate educ_`k' year, generate(educ_`k'1)

	* Diagnostic: r(N) holds the number of in-sample country-years gained by interpolation
	sum cntrycode if independence_years != . & year >= $startyear & educ_`k' == . & educ_`k'1 != .

	replace educ_`k' = educ_`k'1
	drop educ_`k'1

	* Anything non-missing that carries no flag yet was created by ipolate
	replace interpolated_educ_`k' = 1 if interpolated_educ_`k' == . & educ_`k' != .
}
	
* Drop single-observation series
* A country whose series holds exactly one non-missing value has that value blanked out.
summarize cntrycode
local last_code = r(max)
forvalues code = 1/`last_code' {
	forvalues s = 1/6 {
		count if cntrycode == `code' & !missing(educ_`s')
		if r(N) == 1 {
			display `s'
			replace educ_`s' = . if cntrycode == `code'
		}
	}
}
	
* Clean some suspicious data points
* According to the WDI data, Micronesia had the highest enrollment rates in the world
* between 1971 & 1990, so its WDI enrollment series is listed and then set to missing.
local flagged "Federated States of Micronesia"
noisily display "Datapoints set to missing (reason: inflated enrollment rates compared to rest of the world):"
noisily list cntry year educ_5 if cntry == "`flagged'" & educ_5 != .
replace educ_5 = . if cntry == "`flagged'"
	
/* 1.6.1.3. Use CLIO data on ays as baseline */
* Creates the composite series educ from educ_1 (CLIO) together with its
* bookkeeping variables. Each flag is defined only where educ is non-missing, so
* that later steps can recognise newly filled observations by a missing flag.

	sum educ_*
	gen educ = educ_1
	gen source_educ = 1 if educ != .
	scalar correlation_educ_1 = 1

	* Identify interpolated parts
	* The flag has to be created with an -if- qualifier: "0 & educ != ." is a
	* logical expression that evaluates to 0 on every row, which would leave no
	* missing flags for the later "interpolated_educ == ." updates to act on.
	gen interpolated_educ = 0 if educ != .
	replace interpolated_educ = 1 if interpolated_educ_1 == 1 & educ != .

	* Identify extrapolated parts
	gen extrapolated_educ = 0  if educ != .

	* identify polynomial predicted parts
	gen polynomialpredicted_educ = 0  if educ != .

/* 1.6.1.4. Maximally extend forward and backward by growth rates implied in UNDP data */
* Fills missing values of the composite series educ in two stages:
*	1. chain the year-on-year growth rate of educ_2 (UNDP) onto the nearest
*	   available educ value, first forward in time and then backward;
*	2. for in-sample country-years that are still missing, copy the level of educ_2.
* The bookkeeping variables are updated after each stage.

	* Correlation between the UNDP series and the baseline, stored rounded to 2 decimals
	noi di "Correlation with baseline series (CLIO)"
	noisily corr educ_2 educ if independence_years != . & year >= $startyear
	scalar correlation_educ_2 = round(`r(rho)',.01)

	* Declare the panel structure so that the L. and F. operators can be used
	xtset cntrycode year
			
	* Single-element loop: `dataset' is always 2 here
	foreach dataset of numlist 2 {
																				
		* Extend forward 
		* Each pass sets a missing educ to last year's educ times (1 + growth of
		* educ_`dataset' since last year). The loop repeats until a pass leaves the
		* number of non-missing in-sample educ values unchanged.
		* NOTE(review): the replace itself is not restricted to the sample, while
		* the stopping count is - confirm that this is intended.
		local i = 999
		while `i' != 0 {
			sum educ  if independence_years != . & year >= $startyear
			local original = r(N)
			replace educ = (1+(educ_`dataset'-L.educ_`dataset')/L.educ_`dataset')*L.educ if educ == .
			sum educ  if independence_years != . & year >= $startyear
			local extended = r(N)
			local i = `extended' - `original'
			}
								
		* Extend backward
		* Mirror image of the forward step, using next year's values (F.)
		local i = 999
		while `i' != 0 {
			sum educ if independence_years != . & year >= $startyear
			local original = r(N)
			replace educ = (1+(educ_`dataset'-F.educ_`dataset')/F.educ_`dataset')*F.educ if educ == .
			sum educ  if independence_years != . & year >= $startyear
			local extended = r(N)
			local i = `extended' - `original'
			}

		* Identify source
		* Observations filled by the growth-rate chaining receive code 992.
		* NOTE(review): section 1.6.1.8 codes growth-extended values as `dataset'+900;
		* by that pattern this would be 902 - confirm which code downstream users expect.
		replace source_educ = 992 if source_educ == . & educ != .
		
		* Identify interpolated parts
		* Only acts on observations whose flag is still missing
		replace interpolated_educ = 1  if educ != . & interpolated_educ_`dataset' == 1 & interpolated_educ == .
					
		* Identify extrapolated parts
		* Everything filled so far in this section counts as extrapolated
		replace extrapolated_educ = 1  if educ != . & extrapolated_educ == .
		
		* Extend for countries missing in reference data series
		* Remaining in-sample gaps take the level of educ_`dataset' directly
		replace educ = educ_`dataset' if educ == .  & independence_years != . & year >= $startyear

		* Identify source
		* Level-filled observations receive the plain dataset code
		replace source_educ = 2 if source_educ == . & educ != .
		
		* Identify interpolated parts
		replace interpolated_educ = 1  if educ != . & interpolated_educ_`dataset' == 1 & interpolated_educ == .
				
		* Identify extrapolated parts
		* Level-filled observations are not counted as extrapolated
		replace extrapolated_educ = 0 if educ != . & extrapolated_educ == .
			
		* Identify polynomial predicted parts
		replace polynomialpredicted_educ = 0 if educ != . & polynomialpredicted_educ == .
		}



/* 1.6.1.5. Predict missing country-years using other sources */

	* Print the in-sample correlation of each remaining series with the composite
	* series and keep it, rounded to two decimals, in a scalar
	noisily display "Correlation with baseline series (CLIO+UNDP)"
	forvalues k = 3/6 {
		noisily correlate educ_`k' educ if independence_years != . & year >= $startyear
		scalar correlation_educ_`k' = round(`r(rho)',.01)
	}

		
	noi di "Calculate predictions based on dataset:", _continue

	* For each alternative series k = 3 ... 6:
	*	- regress the composite series educ on a cubic in educ_k plus year dummies,
	*	  region dummies and their interactions;
	*	- keep the fitted values in appr_educ_k (these survive the loop);
	*	- store the adjusted R2 in scalar appr_educ_k_r2;
	*	- store in scalar predictive_accuracy_k the share of observations whose educ
	*	  lies inside the 99% band (fitted value +/- t * stdp).
	foreach dataset of numlist 3 4 5 6 {

		noi di "`dataset'", _continue

		* Squared and cubic term of the educational attainment indicator
		gen educ_`dataset'_sq = educ_`dataset'^2
		gen educ_`dataset'_cu = educ_`dataset'^3

		* Year dummies. tab numbers them 1, 2, ...; they are renamed so that the
		* suffix is the calendar year. The rename loop assumes that every year from
		* $startyear through 2016 occurs in the data.
		tab year if year >= $startyear, gen(year_)
		local tyear = 2016 - $startyear + 1
		forval i = 1/`tyear' {
			local j = `i' + $startyear - 1
			rename year_`i' year_`j'
		}

		* Region dummies (the interaction loops below assume exactly 7 regions)
		tab regioncode, gen(region_)

		* Interactions. All names start with year_ or region_, so the wildcards in
		* the regression pick them up together with the plain dummies. The creation
		* order is kept fixed because it determines the wildcard expansion order.

		* region and year
		forval i = ${startyear}/2016 {
			forval j = 1/7 {
				gen year_`i'Xregion_`j' = year_`i' * region_`j'
			}
		}

		* year and educational attainment
		forval i = ${startyear}/2016 {
			gen year_`i'Xeduc_`dataset' = year_`i' * educ_`dataset'
			gen year_`i'Xeduc_`dataset'_sq = year_`i' * educ_`dataset' * educ_`dataset'
		}

		* region and educational attainment
		forval j = 1/7 {
			gen region_`j'Xeduc_`dataset' = region_`j' * educ_`dataset'
			gen region_`j'Xeduc_`dataset'_sq = region_`j' * educ_`dataset' * educ_`dataset'
		}

		* year, region and educational attainment
		forval i = ${startyear}/2016 {
			forval j = 1/7 {
				gen year_`i'Xregion_`j'Xeduc_`dataset' = year_`i' * region_`j' * educ_`dataset'
			}
		}

		* Predictions:
		reg educ educ_`dataset' educ_`dataset'_sq educ_`dataset'_cu region_* year_* if independence_years != . & year >= $startyear
		scalar appr_educ_`dataset'_r2 = round(e(r2_a), .001)

		* Fitted values are only produced up to the last year in which educ is observed
		sum year if educ != .
		local maxyear = `r(max)'
		predict appr_educ_`dataset' if independence_years != . & year >= $startyear & year <= `maxyear'

		* 99% band around the fitted value, based on the standard error of the linear prediction
		predict appr_educ_`dataset'_se if independence_years != . & year >= $startyear, stdp
		local N = e(df_r)
		gen appr_educ_`dataset'_ub = appr_educ_`dataset' + invttail(`N',.005)*appr_educ_`dataset'_se
		gen appr_educ_`dataset'_lb = appr_educ_`dataset' - invttail(`N',.005)*appr_educ_`dataset'_se

		* 1 = educ inside the band, 0 = outside, missing where educ or the fitted value is missing
		gen predictive_accuracy_`dataset' = .
		replace predictive_accuracy_`dataset' = 1 if educ !=. & appr_educ_`dataset' != . & appr_educ_`dataset'_lb <= educ & educ <= appr_educ_`dataset'_ub
		replace predictive_accuracy_`dataset' = 0 if educ !=. & appr_educ_`dataset' != . & (appr_educ_`dataset'_lb > educ | educ > appr_educ_`dataset'_ub)

		* Share of hits; -drop- is a variable that is not created in this section
		* (presumably an exclusion marker defined earlier - confirm).
		sum predictive_accuracy_`dataset' if drop == 0
		scalar predictive_accuracy_`dataset' = round(r(mean),.0001)

		* Drop helper variables. appr_educ_`dataset' itself does not match the
		* appr_educ_`dataset'_* pattern and is kept; the scalars are unaffected.
		drop year_* region_* educ_`dataset'_sq educ_`dataset'_cu appr_educ_`dataset'_* predictive_*
	}

	noi di ""
			
/* 1.6.1.6. Report implications */

* In-sample correlation between the composite series and each fitted series
noisily display "Correlation of predictions with reference data:"
forvalues k = 3/6 {
	noisily correlate educ appr_educ_`k' if independence_years != . & year >= $startyear
}

* Stored share of observations inside the prediction band
noisily display "Predictive accuracy:"
forvalues k = 3/6 {
	noisily scalar list predictive_accuracy_`k'
}

* Stored adjusted R2 of each regression
noisily display "Adjusted R squared of regresions"
forvalues k = 3/6 {
	noisily scalar list appr_educ_`k'_r2
}

noisily display "Approximated trajectories: summary statistics"
noisily summarize educ appr_educ_* if independence_years != . & year >= $startyear

/* 1.6.1.8. Extend baseline data */
* Fills remaining gaps in educ from the fitted series appr_educ_3 ... appr_educ_6,
* processed in that order. For each series: first chain its growth rates onto
* educ (forward, then backward), then copy its level into in-sample country-years
* that are still missing.

* Declare the panel structure so that the L. and F. operators can be used
xtset cntrycode year

noi di "Extend baseline data using dataset:", _continue

foreach dataset of numlist 3/6 {

	* Source codes: `source' marks level-filled values, `source2' (= 900 + dataset)
	* marks values obtained by growth-rate chaining
	local source = `dataset'
	local source2 = `dataset'+900
	
	noi di "`dataset'", _continue

	* Extend forward 
	* Each pass sets a missing educ to last year's educ times (1 + growth of the
	* fitted series since last year); repeated until a pass leaves the number of
	* non-missing in-sample educ values unchanged.
	* NOTE(review): the replace is not restricted to the sample, while the
	* stopping count is - confirm that this is intended.
	local i = 999
	while `i' != 0 {
		sum educ if independence_years != . & year >= $startyear
		local original = r(N)
		replace educ = (1+(appr_educ_`dataset'-L.appr_educ_`dataset')/L.appr_educ_`dataset')*L.educ if educ == .
		sum educ if independence_years != . & year >= $startyear
		local extended = r(N)
		local i = `extended' - `original'
		}
	
	* Extend backward
	* Mirror image of the forward step, using next year's values (F.)
	local i = 999
	while `i' != 0 {
		sum educ if independence_years != . & year >= $startyear
		local original = r(N)
		replace educ = (1+(appr_educ_`dataset'-F.appr_educ_`dataset')/F.appr_educ_`dataset')*F.educ if educ == .
		sum educ if independence_years != . & year >= $startyear
		local extended = r(N)
		local i = `extended' - `original'
		}

	* Identify source
	* Values filled by growth-rate chaining
	replace source_educ = `source2' if source_educ == . & educ != .
	
	* Extend for countries missing in reference data series
	* Remaining in-sample gaps take the fitted level directly
	replace educ = appr_educ_`dataset' if educ == . & independence_years != . & year >= $startyear

	* Identify source
	* Values filled with the fitted level
	replace source_educ = `source' if source_educ == . & educ != .
	
	* Identify interpolated parts
	* Applies to every observation whose flag is still missing at this point
	replace interpolated_educ = 0  if educ != . & interpolated_educ == .
					
	* Identify extrapolated parts
	replace extrapolated_educ = 0 if educ != . & extrapolated_educ == .
					
	* Identify polynomial predicted parts
	* Both chained and level-filled values from this section are flagged as predicted
	replace polynomialpredicted_educ = 1 if educ != . & polynomialpredicted_educ == .
	}
	
	noi di ""
	
/* 1.6.1.9. Linearly interpolate missings between non-overlapping parts */

	* Linear interpolation of the composite series over year, within each country
	tempvar filled
	sort cntry year
	by cntry: ipolate educ year, generate(`filled')
	replace educ = `filled'
	drop `filled'

	* Values that still carry no flag were created just now: mark them as
	* interpolated, not extrapolated, not polynomial-predicted, with source code 0
	replace interpolated_educ = 1 if educ != . & interpolated_educ == .
	foreach flag in extrapolated polynomialpredicted {
		replace `flag'_educ = 0 if educ != . & `flag'_educ == .
	}
	replace source_educ = 0 if educ != . & source_educ == .
	sort cntry year

/* 1.6.1.9. Report composition of data */

	* In-sample summary of the finished composite series
	noisily display "Baseline data: summary statistics"
	noisily summarize educ if independence_years != . & year >= $startyear


/* 1.6.1.10. Label variables */

label variable educ "Average years of education, composite index"

/* 1.6.1.11. Drop original data */

* Removes the fitted appr_ series and the source series educ_1 ... educ_6,
* including their educ_k_raw copies; educ itself does not match either pattern
drop appr_* educ_*

/* 1.6.2. Health */
* Linearly interpolated average life expectancy in years in WDI, CLIO & barlee

/* 1.6.2.1. Load data */

	* Each source is merged on its own country code plus year. keep(master match)
	* retains only rows present in the data in memory, and nogenerate suppresses
	* the _merge indicator. destring, force converts non-numeric entries to missing.

	* Barro & Lee (1994)
	merge 1:1 cntrycode_barlee year using Barlee, keepusing(lexpec_barlee) keep(master match) nogenerate
	destring lexpec_barlee, force replace

	* CLIO (2017)
	merge 1:1 cntrycode_CLIO year using CLIO_lexpec, keepusing(lexpec_CLIO) keep(master match) nogenerate
	destring lexpec_CLIO, force replace

	* WDI (2017)
	merge 1:1 cntrycode_WDI year using WDI, keepusing(lexpec_WDI) keep(master match) nogenerate
	destring lexpec_WDI, force replace

/* 1.6.2.2. Linearly interpolate missing values */
sort cntry year

* For each life-expectancy source, interpolated_health_<source> is 0 for values
* present before interpolation and 1 for values filled in by ipolate.
foreach name in WDI CLIO barlee {
	gen interpolated_health_`name' = 0 if lexpec_`name' != .

	* Linear interpolation over year, separately within each country
	by cntry: ipolate lexpec_`name' year, generate(lexpec_`name'1)

	* Diagnostic: r(N) holds the number of in-sample country-years gained by interpolation
	sum cntrycode if independence_years != . & year >= $startyear & lexpec_`name' == . & lexpec_`name'1 != .

	replace lexpec_`name' = lexpec_`name'1
	drop lexpec_`name'1

	* Anything non-missing that carries no flag yet was created by ipolate
	replace interpolated_health_`name' = 1 if interpolated_health_`name' == . & lexpec_`name' != .
}
				
/* 1.6.2.3. Compute average over all available sources */

* Copy each source where its interpolation flag is below 990, then take the row mean
foreach src in WDI CLIO barlee {
	generate lexpec_nm_`src' = lexpec_`src' if interpolated_health_`src' < 990
}
egen health = rowmean(lexpec_nm_*)

* source_health concatenates the codes of the contributing sources
* (1 = WDI, 2 = CLIO, 3 = barlee), e.g. 13 when WDI and barlee are available
generate source_health = ""
local code = 0
foreach src in WDI CLIO barlee {
	local ++code
	replace source_health = source_health + "`code'" if interpolated_health_`src' != .
}
destring source_health, replace

* Interpolated if any of the sources was interpolated for that country-year
generate interpolated_health = 0 if health != .
foreach src in WDI CLIO barlee {
	replace interpolated_health = 1 if interpolated_health_`src' == 1
}

* Health values are never extrapolated or polynomial-predicted
foreach flag in extrapolated polynomialpredicted {
	generate `flag'_health = 0 if health != .
}

* Fallback: plain row mean of the sources where health is still missing.
* Note that health_help is not dropped in this section.
drop lexpec_nm_*
egen health_help = rowmean(lexpec_*)
replace health = health_help if health == . & health_help != .

/* 1.6.2.4. Compute correlation with baseline data */

* CLIO is the reference (correlation 1 by construction); WDI and barlee are
* stored as correlation_health_2 and correlation_health_3, rounded to two decimals
noisily display "Correlation with baseline series (CLIO)"
scalar correlation_health_1 = 1
local idx = 1
foreach src in WDI barlee {
	local idx = `idx' + 1
	noisily correlate lexpec_`src' lexpec_CLIO
	scalar correlation_health_`idx' = round(`r(rho)',.01)
}

/* 1.6.2.4. Compute logs */
* Natural log of (life expectancy + 1)
generate lhealth = ln(1 + health)

/* 1.6.2.5. Label relevant variables */
label variable health "Life expectancy (various sources)"
label variable lhealth "Log life expectancy"

/* 1.6.2.6. Drop original data */

* Removes the individual source series
drop lexpec_*

* Reroute to directory containing dofiles
* Forward slashes are used because Stata accepts them on every platform,
* whereas a backslash path only resolves on Windows.
cd "../1. Dofiles"
}
